Using the bag of words generated from the corpus and the topic models already trained, I now project the publications into the LDA topic space, and likewise the individuals in the School.
import pandas as pd
import numpy as np
import cPickle as pkl  # Python 2 pickle (C-accelerated)
import matplotlib.pyplot as plt
%matplotlib inline
# Fixed seed so the stochastic embeddings below (TSNE / MDS) are reproducible.
rng = np.random.RandomState(1234567)
from gensim import models
from gensim.corpora import Dictionary
# Pre-scraped lookup tables: publications, people ("PoInf"), and the
# tokenised publication summaries.
lookup_pub = pkl.load(open('../infnet-analysis/data/lookup_pub.pkl', 'rb'))
lookup_poinf = pkl.load(open('../infnet-analysis/data/lookup_poinf.pkl','rb'))
pub_toks = pkl.load(open('../infnet-scrapper/data/pub_toks.pkl','rb'))
from hdbscan import HDBSCAN ## TRYING WITH HDBSCAN: http://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html
# NOTE(review): `sns` (seaborn) is used in the plotting cells below but is
# never imported in this view -- confirm `import seaborn as sns` runs earlier.
# Load Dictionary to convert words to id:
dictionary = pkl.load(open('../topicModel/dictionary.pkl','rb'))
# Convert tokens to bow:
# `bowified` maps a row's summary tokens to a gensim sparse bag-of-words
# [(token_id, count), ...]; it is reused later for the people-level table too.
bowified = lambda row: dictionary.doc2bow(row.summary_toks)
pub_toks['bow'] = pub_toks.apply(bowified, axis=1)
pub_toks.head(4)
# load the LDA models:
fullpubLDA = models.LdaModel.load('fullpub.ldamodel')
def inference(ldaModel, ldaVector):
    """Project one BOW vector into the model's topic space.

    Returns a dense list of length ldaModel.num_topics whose i-th entry is
    the weight of topic i (0 for topics gensim omitted from its sparse
    output).
    """
    dense = [0] * ldaModel.num_topics
    # gensim yields a sparse list of (topic_id, weight) pairs;
    # scatter them into the dense vector.
    for topic_id, weight in ldaModel[ldaVector]:
        dense[topic_id] = weight
    return dense
# Project every publication's BOW into the LDA topic space.
def _inference(row):
    return inference(fullpubLDA, row.bow)

pub_toks['topic_distribution'] = pub_toks.apply(_inference, axis=1)
pub_toks.head(4)
def best_topic(topic_dist):
    """
    Assign the publication the topic that best describes it; this is
    the index with the highest value in its topic distribution.
    """
    idx = int(np.argmax(topic_dist))
    # Sanity check: the winning index must lie inside the distribution.
    # (The previous hard-coded bound of 20 only held for a 20-topic model;
    # bounding by len(topic_dist) works for any number of topics.)
    assert 0 <= idx < len(topic_dist)
    return idx
# Tag each publication with its single most salient topic.
pub_toks['best_topic'] = pub_toks['topic_distribution'].apply(best_topic)
pub_toks.head(4)
## Our dataset will be the topic_distribution:
data = pub_toks.topic_distribution.values
We can visualise the data in 2D, colouring each publication by the topic it is most salient on:
from sklearn import manifold
TSNE: Here, we use the manifold package from sklearn to reduce the dimensionality of the data for visualisation.
For colouring, we use the topic with the highest probability.
# 2D t-SNE embedding of the topic distributions, coloured by best topic.
x_components = manifold.TSNE(n_components=2, init='pca', random_state=rng).fit_transform(data)
f, ax = plt.subplots(figsize=(10, 10))
ax.scatter(x_components[:, 0], x_components[:, 1],
           c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
plt.show()
MDS: Multidimensional Scaling
# 2D MDS embedding of the same data, coloured identically for comparison.
mds_components = manifold.MDS(n_components=2, random_state=rng).fit_transform(data)
f, ax = plt.subplots(figsize=(10, 10))
ax.scatter(mds_components[:, 0], mds_components[:, 1],
           c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
plt.show()
While the colours of the publications are based on the topic each publication is most salient on (from the LDA), we make no assumption that these are the true clusters.
Hence, we now apply clustering algorithms to the data and colour points by cluster instead. We start with 20 clusters, matching the number of topics.
from sklearn.cluster import KMeans
# n_cluster = 20
# (Fixed notebook-export garbling: the markdown heading was fused onto this
# code line, making it invalid Python.)
# KMeans with the same number of clusters as LDA topics, side by side with
# the best-topic colouring on the t-SNE embedding.
kmeansClustering = KMeans(n_clusters=20).fit_predict(data)
f = plt.figure(figsize=(20,10))
ax = f.add_subplot(121)
ax.scatter(x_components[:,0], x_components[:,1], c=kmeansClustering, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on kMeans (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(x_components[:,0], x_components[:,1], c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=8)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
# Same 20-cluster KMeans comparison, but on the MDS embedding.
f = plt.figure(figsize=(20,10))
ax = f.add_subplot(121)
ax.scatter(mds_components[:,0], mds_components[:,1], c=kmeansClustering, cmap=plt.cm.jet, s=20)
ax.legend(loc='best')
ax.axis('off')
# Fixed title typo: 'baed' -> 'based'.
ax.set_title('Clustering based on kMeans (mds)')
ax2 = f.add_subplot(122)
ax2.scatter(mds_components[:,0], mds_components[:,1], c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=20)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('Clustering based on best_topic (mds)')
# Removed a redundant duplicate plt.show() call.
plt.show()
# n_cluster = 30
# (Fixed notebook-export garbling: heading fused onto the code line.)
# Over-segment relative to the 20 topics to see if finer structure appears.
kmeansClustering30 = KMeans(n_clusters=30).fit_predict(data)
f = plt.figure(figsize=(20,10))
ax = f.add_subplot(121)
ax.scatter(x_components[:,0], x_components[:,1], c=kmeansClustering30, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on kMeans (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(x_components[:,0], x_components[:,1], c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=8)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
# n_cluster = 10
# (Fixed notebook-export garbling: heading fused onto the code line.)
# Under-segment relative to the 20 topics as the opposite comparison.
kmeansClustering10 = KMeans(n_clusters=10).fit_predict(data)
f = plt.figure(figsize=(20,10))
ax = f.add_subplot(121)
ax.scatter(x_components[:,0], x_components[:,1], c=kmeansClustering10, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on kMeans (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(x_components[:,0], x_components[:,1], c=list(pub_toks.best_topic), cmap=plt.cm.jet, s=8)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
from sklearn.cluster import DBSCAN
dbscan_pub = DBSCAN().fit(data)
dbscan_clusters = dbscan_pub.labels_
n_clusters_ = len(set(dbscan_clusters)) - (1 if -1 in dbscan_clusters else 0)
print 'number of clusters:', n_clusters_
dbscan_pub = DBSCAN().fit(x_components)
dbscan_clusters = dbscan_pub.labels_
n_clusters_ = len(set(dbscan_clusters)) - (1 if -1 in dbscan_clusters else 0)
print 'number of clusters:', n_clusters_
# DBSCAN labels vs. best-topic colouring on the t-SNE embedding.
f = plt.figure(figsize=(20, 10))
ax = f.add_subplot(121)
# NOTE(review): `sns` requires `import seaborn as sns` earlier -- confirm.
color_palette = sns.color_palette('Paired', 203)
# Noise points (label -1) are drawn grey.
cluster_colors = [
    color_palette[x] if x >= 0 else (0.5, 0.5, 0.5) for x in dbscan_clusters
]
ax.scatter(
    x_components[:, 0], x_components[:, 1], c=cluster_colors, s=50, alpha=.25)
ax.legend(loc='best')
ax.axis('off')
# BUG FIX: this panel colours by DBSCAN labels, not HDBSCAN -- title corrected.
ax.set_title('clustering based on dbscan (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(
    x_components[:, 0],
    x_components[:, 1],
    c=list(pub_toks.best_topic),
    cmap=plt.cm.jet,
    s=50,
    alpha=.25)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
dbscan_pub_mds = DBSCAN().fit(mds_components)
dbscan_pub_mds_clusters = dbscan_pub_mds.labels_
n_clusters_ = len(set(dbscan_pub_mds_clusters)) - (1 if -1 in dbscan_pub_mds_clusters else 0)
print 'number of clusters:', n_clusters_
from hdbscan import HDBSCAN
import hdbscan
hdbscan_cluster = HDBSCAN().fit(data)
n_clusters_ = len(set(hdbscan_cluster.labels_)) - (1 if -1 in hdbscan_cluster.labels_ else 0)
print 'number of clusters:', n_clusters_
f = plt.figure()
ax = f.add_subplot(111)
ax.hist(hdbscan_cluster.labels_,bins=50);
plt.show()
# HDBSCAN hard labels (left) vs. best-topic colouring (right) on t-SNE.
f = plt.figure(figsize=(20, 10))
ax = f.add_subplot(121)
# NOTE(review): `sns` is seaborn, never imported in this view -- confirm
# `import seaborn as sns` ran earlier in the notebook.
color_palette = sns.color_palette('Paired', 102)
# Noise points (label -1) are drawn grey.
cluster_colors = [
color_palette[x] if x >= 0 else (0.5, 0.5, 0.5)
for x in hdbscan_cluster.labels_
]
# colors weighted according to the probability of being in the cluster
cluster_member_colors = [
sns.desaturate(x, p)
for x, p in zip(cluster_colors, hdbscan_cluster.probabilities_)
]
ax.scatter(
*x_components.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
# ax.scatter(x_components[:,0], x_components[:,1], c=hdbscan_cluster.labels_, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on HDBSCAN (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(
x_components[:, 0],
x_components[:, 1],
c=list(pub_toks.best_topic),
cmap=plt.cm.jet,
s=50,
alpha=.25)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
clusterer = HDBSCAN(prediction_data=True).fit(data)
n_clusters_ = len(set(hdbscan_cluster.labels_)) - (1 if -1 in hdbscan_cluster.labels_ else 0)
print 'number of clusters:', n_clusters_
f = plt.figure()
ax = f.add_subplot(111)
ax.hist(clusterer.labels_,bins=50);
plt.show()
# Soft HDBSCAN clustering: each point is assigned its highest-membership
# cluster (top-left), compared against the best-topic colouring (top-right)
# and a desaturated-by-confidence version (bottom-left).
f = plt.figure(figsize=(20, 20))
ax = f.add_subplot(221)
soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
color_palette = sns.color_palette('Paired', 102)
# Colour by the argmax of each point's membership vector.
cluster_colors = [color_palette[np.argmax(x)] for x in soft_clusters]
ax.scatter(*x_components.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
# ax.scatter(x_components[:,0], x_components[:,1], c=hdbscan_cluster.labels_, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('soft clustering based on HDBSCAN (using TSNE)')
ax2 = f.add_subplot(222)
ax2.scatter(
x_components[:, 0],
x_components[:, 1],
c=list(pub_toks.best_topic),
cmap=plt.cm.jet,
s=50,
alpha=.25)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
ax3 = f.add_subplot(223)
# Desaturate each colour by the point's membership strength (np.max of the
# membership vector), so uncertain points fade out.
cluster_colors = [
sns.desaturate(color_palette[np.argmax(x)], np.max(x))
for x in soft_clusters
]
ax3.scatter(*x_components.T, s=50, linewidth=0, c=cluster_colors, alpha=0.25)
ax3.axis('off')
ax3.set_title(
'soft clustering based on HDBSCAN, desaturated based on cluster probability (using TSNE)'
)
plt.show()
# Create a new pandas table that merges lookup_poinf and lookup_pub:
# each pub in lookup_pub has a collab_id listing its collaborators by id,
# and collaborators who are not PoInf members are ignored.
# Create the set of PoInf ids for O(1) membership checks:
poinf_id = set(lookup_poinf.index)
# Map each person id -> set of publication ids they collaborated on.
# BUG FIX: keys were previously built as str(_id) while updates used the raw
# _id from collab_ids; if the ids are not already strings that raises
# KeyError. Keying by the raw id keeps both sides (and the later join
# against lookup_poinf's index) consistent.
pub_mapping = {_id: set() for _id in poinf_id}
for row in lookup_pub.iterrows():
    pub_id = row[0]
    collab_ids = row[1]['collab_id']
    for _id in collab_ids:
        if _id in poinf_id:
            pub_mapping[_id].add(pub_id)
row_list = [{'id': k, 'pub_ids': v} for (k, v) in pub_mapping.items()]
# Add these pub_ids to the pandas df:
df_pubmapping = pd.DataFrame(row_list)
lookup_poinf_more = lookup_poinf.join(df_pubmapping.set_index('id'))
lookup_poinf_more.iloc[20:24]
def getToks(pub_ids):
    """Gather the summary tokens of every listed publication into one flat list."""
    collected = []
    try:
        if len(pub_ids):
            for pid in pub_ids:
                collected.extend(pub_toks[pub_toks.index == pid].summary_toks)
            # Flatten the list of token lists into a single token list:
            collected = [tok for toks in collected for tok in toks]
    except TypeError:
        # pub_ids without len() (e.g. NaN for people with no pubs) -- report it.
        print(pub_ids)
    return collected
# Pool each person's publication tokens, then project them into topic space
# with the same pipeline used for publications.
lookup_poinf_more['summary_toks'] = lookup_poinf_more.apply(lambda row: getToks(row.pub_ids), axis=1)
# Convert to BOW using bowified:
lookup_poinf_more['bow'] = lookup_poinf_more.apply(bowified, axis=1)
lookup_poinf_more['topic_distribution'] = lookup_poinf_more.apply(_inference, axis=1)
lookup_poinf_more.head(2)
len(lookup_poinf_more)
# Flag individuals with an empty BOW (no usable publications).
lookup_poinf_more['remove_drop'] = lookup_poinf_more.apply(lambda row: len(row.bow) == 0, axis=1)
# Remove individuals that do not have any bow:
lookup_poinf_more_drop = lookup_poinf_more.drop(lookup_poinf_more[lookup_poinf_more.remove_drop==True].index)
len(lookup_poinf_more_drop)
lookup_poinf_more_drop['best_topic'] = lookup_poinf_more_drop.apply(lambda row: best_topic(row.topic_distribution), axis=1)
lookup_poinf_more_drop.head(2)
## Our dataset will be the topic_distribution:
data_poinf = list(lookup_poinf_more_drop.topic_distribution.values)
# t-SNE embedding of the people-level topic vectors.
poinf_tsne = manifold.TSNE(n_components=2, init='pca', random_state=rng).fit_transform(data_poinf)
# TSNE
# (Fixed notebook-export garbling: heading fused onto the code line.)
# People-level t-SNE embedding, coloured by each person's best topic.
f = plt.figure(figsize=(10, 10))
ax = f.add_subplot(111)
ax.scatter(
    poinf_tsne[:, 0],
    poinf_tsne[:, 1],
    c=list(lookup_poinf_more_drop.best_topic),
    cmap=plt.cm.jet,
    s=50,
    alpha=.5)
ax.legend(loc='best')
ax.axis('off')
plt.show()
poinf_tsne.shape
MDS: Multidimensional Scaling
# MDS projection of the people-level topic vectors, same colouring as above.
mds_poinf = manifold.MDS(n_components=2, random_state=rng).fit_transform(data_poinf)
f, ax = plt.subplots(figsize=(10, 10))
ax.scatter(mds_poinf[:, 0], mds_poinf[:, 1],
           c=list(lookup_poinf_more_drop.best_topic),
           cmap=plt.cm.jet, s=50, alpha=.5)
ax.legend(loc='best')
ax.axis('off')
plt.show()
# KMeans (20 clusters, matching the topic count) on the people-level data,
# shown against the best-topic colouring on the t-SNE embedding.
kmeansClustering = KMeans(n_clusters=20).fit_predict(data_poinf)
f = plt.figure(figsize=(20, 10))
ax = f.add_subplot(121)
ax.scatter(
poinf_tsne[:, 0],
poinf_tsne[:, 1],
c=kmeansClustering,
cmap=plt.cm.jet,
s=50,
alpha=.5)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on kMeans (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(
poinf_tsne[:, 0],
poinf_tsne[:, 1],
c=list(lookup_poinf_more_drop.best_topic),
cmap=plt.cm.jet,
s=50,
alpha=.5)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
# Same KMeans-vs-best-topic comparison on the people-level MDS embedding.
f = plt.figure(figsize=(20, 10))
ax = f.add_subplot(121)
ax.scatter(
    mds_poinf[:, 0],
    mds_poinf[:, 1],
    c=kmeansClustering,
    cmap=plt.cm.jet,
    s=50,
    alpha=.5)
ax.legend(loc='best')
ax.axis('off')
# Fixed title typo: 'baed' -> 'based'.
ax.set_title('Clustering based on kMeans (mds)')
ax2 = f.add_subplot(122)
ax2.scatter(
    mds_poinf[:, 0],
    mds_poinf[:, 1],
    c=list(lookup_poinf_more_drop.best_topic),
    cmap=plt.cm.jet,
    s=50,
    alpha=.5)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('Clustering based on best_topic (mds)')
# Removed a redundant duplicate plt.show() call.
plt.show()
dbscan = DBSCAN(min_samples=1, algorithm='ball_tree', leaf_size=2).fit(data_poinf)
dbscan_clusters = dbscan.labels_
n_clusters_ = len(set(dbscan_clusters)) - (1 if -1 in dbscan_clusters else 0)
print 'number of clusters:', n_clusters_
dbscan_clusters
# Try with the tsne data:
dbscan_tsne = DBSCAN().fit(x)
dbscan_tsne_clusters = dbscan_tsne.labels_
n_clusters_tsne = len(set(dbscan_tsne_clusters)) - (1 if -1 in dbscan_tsne_clusters else 0)
print 'number of clusters:', n_clusters_
hdbscan_cluster_poinf = HDBSCAN().fit(data_poinf)
n_clusters_ = len(set(hdbscan_cluster_poinf.labels_)) - (1 if -1 in hdbscan_cluster_poinf.labels_ else 0)
print 'number of clusters:', n_clusters_
f = plt.figure()
ax = f.add_subplot(111)
ax.hist(hdbscan_cluster_poinf.labels_,bins=50);
plt.show()
# People-level HDBSCAN labels (left, desaturated by membership probability)
# vs. best-topic colouring (right) on the t-SNE embedding.
f = plt.figure(figsize=(20,10))
ax = f.add_subplot(121)
color_palette = sns.color_palette('husl', 103)
# Noise points (label -1) are drawn grey.
cluster_colors = [color_palette[x] if x >= 0
else (0.5, 0.5, 0.5)
for x in hdbscan_cluster_poinf.labels_]
# colors weighted according to the probability of being in the cluster
cluster_member_colors = [sns.desaturate(x, p) for x, p in
zip(cluster_colors, hdbscan_cluster_poinf.probabilities_)]
ax.scatter(*poinf_tsne.T, s=50, linewidth=0, c=cluster_member_colors, alpha=0.25)
# ax.scatter(x_components[:,0], x_components[:,1], c=hdbscan_cluster.labels_, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('clustering based on HDBSCAN (using TSNE)')
ax2 = f.add_subplot(122)
ax2.scatter(poinf_tsne[:,0], poinf_tsne[:,1], c=list(lookup_poinf_more_drop.best_topic), cmap=plt.cm.jet, s=50, alpha=.5)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
plt.show()
cluster_poinf = HDBSCAN(prediction_data=True).fit(data_poinf)
n_clusters_ = len(set(cluster_poinf.labels_)) - (1 if -1 in cluster_poinf.labels_ else 0)
print 'number of clusters:', n_clusters_
f = plt.figure()
ax = f.add_subplot(111)
ax.hist(cluster_poinf.labels_,bins=50);
plt.show()
# Soft HDBSCAN memberships for people: hard argmax assignment (top-left),
# best-topic colouring (top-right), and the argmax assignment desaturated by
# membership strength (bottom-left).
f = plt.figure(figsize=(16, 16))
ax = f.add_subplot(221)
soft_clusters = hdbscan.all_points_membership_vectors(cluster_poinf)
# NOTE(review): only 2 palette colours -- this assumes the model found exactly
# two clusters, so np.argmax never exceeds index 1; confirm against the
# printed cluster count above.
color_palette = sns.color_palette('husl', 2)
cluster_colors = [color_palette[np.argmax(x)] for x in soft_clusters]
ax.scatter(*poinf_tsne.T, s=70, linewidth=0, c=cluster_colors, alpha=0.5)
# ax.scatter(x_components[:,0], x_components[:,1], c=hdbscan_cluster.labels_, cmap=plt.cm.jet, s=8)
ax.legend(loc='best')
ax.axis('off')
ax.set_title('soft clustering based on HDBSCAN (using TSNE)')
ax2 = f.add_subplot(222)
ax2.scatter(
poinf_tsne[:, 0],
poinf_tsne[:, 1],
c=list(lookup_poinf_more_drop.best_topic),
cmap=plt.cm.jet,
s=70,
alpha=.5)
ax2.legend(loc='best')
ax2.axis('off')
ax2.set_title('clustering based on best topic')
ax3 = f.add_subplot(223)
# Desaturate by the winning membership probability so uncertain people fade.
cluster_colors = [
sns.desaturate(color_palette[np.argmax(x)], np.max(x))
for x in soft_clusters
]
ax3.scatter(*poinf_tsne.T, s=70, linewidth=0, c=cluster_colors, alpha=0.5)
ax3.axis('off')
ax3.set_title(
'soft clustering based on HDBSCAN, desaturated based on cluster probability (using TSNE)'
)
plt.show()
soft_clusters.shape
# Hard assignment from the soft memberships: each person goes to the cluster
# with the highest membership probability.
classDef = soft_clusters.argmax(axis=1)
class1 = classDef[classDef == 0]
class2 = classDef[classDef == 1]
# Inspect the people that fall into each of the two clusters.
lookup_poinf_more_drop.iloc[classDef == 1]
lookup_poinf_more_drop.iloc[classDef == 0]